from plotly.offline import init_notebook_mode, iplot
from sqlalchemy import create_engine
from IPython.display import display
import chart_studio.plotly as py
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import dask.dataframe as dd
from datetime import datetime
import pandas as pd
import hvplot.dask
import os.path
import pathlib
import pyodbc
import plotly.express as px
import seaborn as sns
# Load only the columns used by the analysis below into a dask dataframe.
file = 'full.csv'
required_columns = [
    'FL_DATE',
    'DISTANCE',
    'DEP_DELAY',
    'OP_CARRIER',
    'ARR_DELAY',
    'CANCELLED',
]
dask_df = dd.read_csv(file, usecols=required_columns)

# Quick sanity checks on the loaded data. Both results are handed to display()
# explicitly: a bare expression is only rendered when it is the last statement
# of a notebook cell, and describe() is lazy, so without compute() the summary
# statistics were never actually calculated.
display(dask_df['DISTANCE'].mean().compute())
display(dask_df.describe().compute())
# Replace the carrier codes in OP_CARRIER with full airline names.
# One dict-based replace on that single column is used instead of masking the
# whole frame once per carrier: the repeated masks compared every column
# (including the numeric ones) against each code. Codes that are not listed
# here are left unchanged, exactly as before.
carrier_names = {
    'UA': 'United Airlines',
    'XE': 'JSX',
    'AS': 'Alaska Airlines',
    '9E': 'Endeavor Air',
    'B6': 'JetBlue Airways',
    'EV': 'ExpressJet',
    'F9': 'Frontier Airlines',
    'G4': 'Allegiant Air',
    'HA': 'Hawaiian Airlines',
    'MQ': 'Envoy Air',
    'NK': 'Spirit Airlines',
    'OH': 'PSA Airlines',
    'OO': 'SkyWest Airlines',
    'VX': 'Virgin America',
    'WN': 'Southwest Airlines',
    'YV': 'Mesa Airline',
    'YX': 'Republic Airways',
    'AA': 'American Airlines',
    'DL': 'Delta Airlines',
}
dask_df = dask_df.assign(
    OP_CARRIER=dask_df['OP_CARRIER'].replace(carrier_names),
)
# Per-carrier aggregates, each drawn as an orange bar chart.
by_carrier = dask_df.groupby('OP_CARRIER')

# Summed arrival delay per carrier (the 23 largest sums).
series = by_carrier['ARR_DELAY'].sum().nlargest(23).compute()
arrival_bars = go.Bar(
    x=series.index,
    y=series.values,
    marker={'color': 'orange'},
    opacity=1,
)
iplot([arrival_bars], filename='Opóźnienia przylotów przez linie lotnicze')

# Summed departure delay per carrier (the 23 largest sums).
series = by_carrier['DEP_DELAY'].sum().nlargest(23).compute()
departure_bars = go.Bar(
    x=series.index,
    y=series.values,
    marker={'color': 'orange'},
    opacity=1,
)
iplot([departure_bars], filename='Opóźnienia odlotów przez linie lotnicze')

# Most popular carrier: number of rows per OP_CARRIER value.
series = dask_df.OP_CARRIER.value_counts().compute()
popularity_bars = go.Bar(
    x=series.index,
    y=series.values,
    marker={'color': 'orange'},
    opacity=1,
)
iplot([popularity_bars], filename='Najpopularniejszy przewoźnik')
# The 20 rows with the largest and the 20 rows with the smallest DISTANCE.
# The frames are passed to display() so they are rendered even when these
# statements are not the last expression of a notebook cell; previously the
# computed results were simply discarded.
display(dask_df.nlargest(20, 'DISTANCE').compute())
display(dask_df.nsmallest(20, 'DISTANCE').compute())

# Total distance covered per carrier (the 23 largest sums), as a bar chart.
series = dask_df.groupby('OP_CARRIER').DISTANCE.sum().nlargest(23).compute()
iplot(
    [go.Bar(x=series.index, y=series.values, marker=dict(color='orange'), opacity=1)],
    filename='łączna pokonana odległość przez przewoźnika',
)
# Per-date aggregates over the full dataset, each drawn as a line chart.
by_date = dask_df.groupby('FL_DATE')

# Sum of CANCELLED per flight date.
# NOTE(review): the filename below is the departure-delay label reused for the
# cancellations chart - looks like a copy-paste; confirm before renaming.
series = by_date['CANCELLED'].sum().compute()
cancelled_line = go.Scatter(
    x=series.index,
    y=series.values,
    mode='lines',
    opacity=1,
)
iplot([cancelled_line], filename='Opóźnienia odlotów przez linie lotnicze')

# Sum of DEP_DELAY (departure delay) per flight date.
series = by_date['DEP_DELAY'].sum().compute()
departure_line = go.Scatter(
    x=series.index,
    y=series.values,
    mode='lines',
    opacity=1,
)
iplot([departure_line], filename='Opóźnienia odlotów przez linie lotnicze')

# Sum of ARR_DELAY (arrival delay) per flight date.
series = by_date['ARR_DELAY'].sum().compute()
arrival_line = go.Scatter(
    x=series.index,
    y=series.values,
    mode='lines',
    opacity=1,
)
iplot([arrival_line], filename='Opóźnienia przylotu przez linie lotnicze')
# Switch to the second CSV; OP_CARRIER is not loaded this time, so only the
# per-date charts are repeated for this file.
file = '20192020.csv'
required_columns = [
    'FL_DATE',
    'DISTANCE',
    'DEP_DELAY',
    'ARR_DELAY',
    'CANCELLED',
]
dask_df = dd.read_csv(file, usecols=required_columns)
by_date = dask_df.groupby('FL_DATE')

# Sum of CANCELLED per flight date.
# NOTE(review): the filename below is the departure-delay label reused for the
# cancellations chart - looks like a copy-paste; confirm before renaming.
series = by_date['CANCELLED'].sum().compute()
cancelled_line = go.Scatter(
    x=series.index,
    y=series.values,
    mode='lines',
    opacity=1,
)
iplot([cancelled_line], filename='Opóźnienia odlotów przez linie lotnicze')

# Sum of DEP_DELAY (departure delay) per flight date.
series = by_date['DEP_DELAY'].sum().compute()
departure_line = go.Scatter(
    x=series.index,
    y=series.values,
    mode='lines',
    opacity=1,
)
iplot([departure_line], filename='Opóźnienia odlotów przez linie lotnicze')

# Sum of ARR_DELAY (arrival delay) per flight date.
series = by_date['ARR_DELAY'].sum().compute()
arrival_line = go.Scatter(
    x=series.index,
    y=series.values,
    mode='lines',
    opacity=1,
)
iplot([arrival_line], filename='Opóźnienia przylotu przez linie lotnicze')